This file is almost a duplicate of SimulationStudySummary.Rmd but aimed at comparing simulation study results of Tau and 1/2Tau cases.

09152017

Show the PSJS estimated results for Simulated datasets with estimated tau

realized.tract.dist <- function(L, p){
  x <- 1:(L-1)
  dist.1 <- (2 + (L-1-x)*p)/(1.0/p + L -1.)*(1-p)^(x-1)
  dist.L <- 1/p/(1.0/p + L -1.)*(1-p)^(L-1)
  dist <- c(dist.1, dist.L)
  mean.L <- sum(x * dist.1) + L*dist.L
  var.L <- sum(x^2 * dist.1) + L^2*dist.L - mean.L^2
  return(list(dist = dist, mean = mean.L, sd = sqrt(var.L)))
}
Tract.list <- c(3.0, 10.0, 50.0, 100.0, 300.0)
for(tract in Tract.list){
  target_summary <- get(paste("PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
  col.names <- target_summary["tract_length", ] < 10*tract
  sim_info <- get(paste("sim.tract.", toString(tract), sep = ""))
  
  hist(target_summary["tract_length", col.names], breaks = 50,
       main = paste("PSJS Estimated Tract length 1/p, Tract = ", toString(tract), ".0 ", sep = ""))
  #abline(v =  realized.tract.dist(492, 1.0/tract)$mean, col = "blue")
  abline(v =  tract, col = 2)
  abline(v =  mean(sim_info["mean subtract length", ]), col = "green")
  
  HalfTau.target_summary <- get(paste("HalfTau_PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
  HalfTau.col.names <- HalfTau.target_summary["tract_length", ] < 10*tract
  HalfTau.sim_info <- get(paste("HalfTau.sim.tract.", toString(tract), sep = ""))
  
  hist(HalfTau.target_summary["tract_length", HalfTau.col.names], breaks = 50,
       main = paste("Half Tau PSJS Estimated Tract length 1/p, Tract = ", toString(tract), ".0 ", sep = ""))
  abline(v =  tract, col = 2)
  
  
  TenthTau.target_summary <- get(paste("TenthTau_PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
  TenthTau.col.names <- TenthTau.target_summary["tract_length", ] < 10*tract
  TenthTau.sim_info <- get(paste("TenthTau.sim.tract.", toString(tract), sep = ""))
  
  hist(TenthTau.target_summary["tract_length", TenthTau.col.names], breaks = 50,
       main = paste("Tenth Tau PSJS Estimated Tract length 1/p, Tract = ", toString(tract), ".0 ", sep = ""))
  abline(v =  tract, col = 2)  
  
  print(paste("Tract = ", toString(tract), ".0 combined PSJS HKY Results", sep = ""))
  print("Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282")
  matrix.print <- matrix(c(sum(TenthTau.col.names), sum(HalfTau.col.names), sum(col.names),
                           mean(TenthTau.target_summary["tract_length", TenthTau.col.names]), mean(HalfTau.target_summary["tract_length", HalfTau.col.names]), mean(target_summary["tract_length", col.names]), 
                           sd(TenthTau.target_summary["tract_length", TenthTau.col.names]), sd(HalfTau.target_summary["tract_length", HalfTau.col.names]), sd(target_summary["tract_length", col.names]),
                           # mean(HalfTau.sim_info["mean subtract length", HalfTau.col.names]), mean(sim_info["mean subtract length", col.names]),
                           # sd(HalfTau.sim_info["mean subtract length", HalfTau.col.names]), sd(sim_info["mean subtract length", col.names])),
                           mean(TenthTau.target_summary["tract_length", TenthTau.col.names] * TenthTau.target_summary["init_rate", TenthTau.col.names]), mean(HalfTau.target_summary["tract_length", HalfTau.col.names] * HalfTau.target_summary["init_rate", HalfTau.col.names]), mean(target_summary["tract_length", col.names] * target_summary["init_rate", col.names]),
                           sd(TenthTau.target_summary["tract_length", TenthTau.col.names] * TenthTau.target_summary["init_rate", TenthTau.col.names]), sd(HalfTau.target_summary["tract_length", HalfTau.col.names] * HalfTau.target_summary["init_rate", HalfTau.col.names]), sd(target_summary["tract_length", col.names] * target_summary["init_rate", col.names]),
                           mean(TenthTau.target_summary["r2", TenthTau.col.names]), mean(HalfTau.target_summary["r2", HalfTau.col.names]), mean(target_summary["r2", col.names]),
                           sd(TenthTau.target_summary["r2", TenthTau.col.names]), sd(HalfTau.target_summary["r2", HalfTau.col.names]), sd(target_summary["r2", col.names]),
                           mean(TenthTau.target_summary["r3", TenthTau.col.names]), mean(HalfTau.target_summary["r3", HalfTau.col.names]), mean(target_summary["r3", col.names]),
                           sd(TenthTau.target_summary["r3", TenthTau.col.names]), sd(HalfTau.target_summary["r3", HalfTau.col.names]), sd(target_summary["r3", col.names])
  ),
  3, 9)
  colnames(matrix.print) <- c("Total samples","mean tract length","sd tract length","mean Tau","sd Tau", 
                              "mean r2", "sd r2", "mean r3", "sd r3")
  rownames(matrix.print) <- c("TenthTau", "HalfTau", "Tau")
  print("Now showing summary of estimates")
  print(matrix.print)  
  
}

## [1] "Tract = 3.0 combined PSJS HKY Results"
## [1] "Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282"
## [1] "Now showing summary of estimates"
##          Total samples mean tract length sd tract length  mean Tau
## TenthTau            91          3.283349        3.312316  2.356514
## HalfTau            100          2.922520        1.113955 12.200990
## Tau                100          2.793908        1.038238 25.032230
##            sd Tau   mean r2     sd r2  mean r3    sd r3
## TenthTau 1.656687 0.6051389 0.2798352 12.39054 3.558901
## HalfTau  4.434941 0.5855221 0.2439452 12.77058 3.795465
## Tau      8.287523 0.5879993 0.2835385 13.20915 4.000814

## [1] "Tract = 10.0 combined PSJS HKY Results"
## [1] "Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282"
## [1] "Now showing summary of estimates"
##          Total samples mean tract length sd tract length  mean Tau
## TenthTau            98         11.537135       14.047343  2.505714
## HalfTau            100         10.000468        4.353661 12.146068
## Tau                 99          8.624418        3.405958 24.368881
##            sd Tau   mean r2     sd r2  mean r3    sd r3
## TenthTau 1.964019 0.5838685 0.2805556 12.38993 3.309168
## HalfTau  4.777245 0.5890073 0.2211978 12.26681 3.183922
## Tau      7.349466 0.5496428 0.2215865 12.09983 2.812596

## [1] "Tract = 50.0 combined PSJS HKY Results"
## [1] "Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282"
## [1] "Now showing summary of estimates"
##          Total samples mean tract length sd tract length  mean Tau
## TenthTau            98          87.35219       152.55136  2.633865
## HalfTau             98          34.97746        17.35117 12.700596
## Tau                 99          37.33633        18.09137 25.787994
##             sd Tau   mean r2     sd r2  mean r3    sd r3
## TenthTau  2.644350 0.6531514 0.2840591 12.77153 3.485084
## HalfTau   7.863899 0.5979565 0.2700079 12.82955 3.631304
## Tau      14.612228 0.5890735 0.2289175 12.32934 3.922138

## [1] "Tract = 100.0 combined PSJS HKY Results"
## [1] "Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282"
## [1] "Now showing summary of estimates"
##          Total samples mean tract length sd tract length  mean Tau
## TenthTau            99         114.38781       175.91309  3.606949
## HalfTau             98          59.38336        56.16456 13.671732
## Tau                 98          63.16095        30.73807 27.120792
##             sd Tau   mean r2     sd r2  mean r3    sd r3
## TenthTau  5.114314 0.5972691 0.2476132 12.86846 3.618311
## HalfTau   9.658842 0.5570442 0.2107733 12.23641 3.095682
## Tau      22.528654 0.6138259 0.3042493 12.81388 4.645047

## [1] "Tract = 300.0 combined PSJS HKY Results"
## [1] "Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282"
## [1] "Now showing summary of estimates"
##          Total samples mean tract length sd tract length  mean Tau
## TenthTau            99          180.6429        209.6309  2.927074
## HalfTau             99          167.4497        215.7783 14.947028
## Tau                 97          180.2817        291.7673 30.643917
##             sd Tau   mean r2     sd r2  mean r3    sd r3
## TenthTau  5.246772 0.5541338 0.2031755 12.35401 2.981998
## HalfTau  12.247934 0.5722834 0.2303773 12.26209 3.317833
## Tau      20.181477 0.6089437 0.2580605 12.39773 3.244111

10/15/2017

Now see if the bias can be corrected by analyzing each half of the alignment Correction uses n choose 2 as sample size

Tract.list <- c(50.0, 100.0, 300.0)
n1 <- choose(489, 2)
n2 <- choose(243, 2)
n3 <- choose(246, 2)
for(tract in Tract.list){
  first.half.summary <- get(paste("first_half_PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
  second.half.summary <- get(paste("second_half_PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
  target.summary <- get(paste("PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
  
  col.names <- intersect(intersect(colnames(first.half.summary), colnames(second.half.summary)), colnames(target.summary))
  
  first.half.corrected <- (n1 - n2) /(n1 * 1./target.summary["tract_length", col.names] - n2 * 1.0/first.half.summary["tract_length", col.names]) 
  second.half.corrected <-  (n1 - n3) / (n1 * 1./target.summary["tract_length", col.names] - n3 * 1./second.half.summary["tract_length", col.names])
  
  # first.half.corrected[first.half.corrected<target.summary["tract_length", col.names]] <- target.summary["tract_length",col.names[first.half.corrected<target.summary["tract_length", col.names]]]
  # second.half.corrected[second.half.corrected<target.summary["tract_length", col.names]] <- target.summary["tract_length",col.names[second.half.corrected<target.summary["tract_length", col.names]]]
  # 
  hist(target.summary["tract_length", col.names], breaks = 50,
       main = paste("PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
  hist(first.half.corrected, breaks = 50,
       main = paste("First half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
  hist(second.half.corrected, breaks = 50,
       main = paste("Second half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))   
  hist((first.half.corrected + second.half.corrected) / 2, breaks = 50,
       main = paste("Both half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
  
  plot(target.summary["tract_length", col.names], (first.half.corrected + second.half.corrected) / 2, 
       xlab = "PSJS estimated", ylab = "Both half corrected", 
       main = paste("Both half corrected vs PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
  abline(a = 0, b = 1)
  abline(h = tract)
  
  n.list <- c(rep(n1, length(col.names)), rep(n2, length(col.names)), rep(n3, length(col.names)))
  ni.theta <- c(n1*log(target.summary["tract_length", col.names]), n2*log(first.half.summary["tract_length", col.names]), n3*log(second.half.summary["tract_length", col.names]))
  theta.list <- c(log(target.summary["tract_length", col.names]), log(first.half.summary["tract_length", col.names]), log(second.half.summary["tract_length", col.names]))
  plot(n.list, ni.theta, 
       main = paste("-ni*log(p) vs ni, Tract = ", toString(tract), ".0 ", sep = ""))
  # OK, this violates the constant variance assumptiong of linear regression
  fit <- lm(ni.theta ~ n.list)
  y <- ni.theta / sqrt(n.list)
  x1 <- sqrt(n.list)
  x2 <- 1/sqrt(n.list)
  fit2 <- lm(y~0+x1+x2)
  abline(a = fit$coefficients[1], b = fit$coefficients[2], col = "red")
  
  plot(1/n.list, theta.list, 
       main = paste("-log(p) vs 1/ni, Tract = ", toString(tract), ".0 ", sep = ""))
  x3 <- 1/n.list
  fit3 <- lm(theta.list~x3)
  abline(a=fit3$coefficients[1], b=fit3$coefficients[2], col="red")
  
  print(fit$coefficients)
  print(c(exp(fit$coefficients[2]), exp(fit2$coefficients[1]), exp(fit3$coefficients[1])))
  print(c(mean(target.summary["tract_length", col.names])/exp(fit$coefficients[1]/n1),
          mean(target.summary["tract_length", col.names])/exp(fit2$coefficients[2]/n1),
          mean(target.summary["tract_length", col.names])/exp(fit3$coefficients[2]/n1)))
      
  print.matrix <- matrix(c(mean(target.summary["tract_length", col.names]), sd(target.summary["tract_length", col.names]), 
                           mean(first.half.corrected), sd(first.half.corrected), 
                           mean(second.half.corrected), sd(second.half.corrected),
                           mean((first.half.corrected + second.half.corrected) / 2), 
                           sd(first.half.corrected + second.half.corrected) / 2), 
                         4, 2, byrow = TRUE) 
  colnames(print.matrix) <- c("mean", "sd")
  rownames(print.matrix) <- c("PSJS estimated", "Corrected using first half", 
                              "Corrected using second half", "Corrected using both")
  print(print.matrix)
}

##  (Intercept)       n.list 
## -9019.770188     3.574033 
##      n.list          x1 (Intercept) 
##    35.66014    35.68844    35.77358 
## (Intercept)          x2          x3 
##    39.23174    39.24729    39.27842 
##                                 mean       sd
## PSJS estimated              36.37531 16.56685
## Corrected using first half  27.47277 58.83801
## Corrected using second half 44.38371 32.16274
## Corrected using both        35.92824 34.14067

##  (Intercept)       n.list 
## -9373.163328     4.116731 
##      n.list          x1 (Intercept) 
##    61.35833    61.32944    61.24278 
## (Intercept)          x2          x3 
##    68.68128    68.66512    68.63282 
##                                 mean        sd
## PSJS estimated              63.49233  31.42831
## Corrected using first half  42.72633 211.68845
## Corrected using second half 45.93008 194.32619
## Corrected using both        44.32820 144.13719

## (Intercept)      n.list 
## 2681.103380    4.692135 
##      n.list          x1 (Intercept) 
##    109.0859    109.0523    108.9514 
## (Intercept)          x2          x3 
##    246.2427    246.2047    246.1290 
##                                  mean        sd
## PSJS estimated               251.8385  746.9723
## Corrected using first half   604.9041 3271.9091
## Corrected using second half -220.5940 2831.3043
## Corrected using both         192.1551 1885.2528

Now see if the bias can be corrected by analyzing each half of the alignment Correction uses n as sample size

Tract.list <- c(50.0, 100.0, 300.0)
n1 <- 489
n2 <- 243
n3 <- 246
for(tract in Tract.list){
  first.half.summary <- get(paste("first_half_PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
  second.half.summary <- get(paste("second_half_PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
  target.summary <- get(paste("PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
  
  col.names <- intersect(intersect(colnames(first.half.summary), colnames(second.half.summary)), colnames(target.summary))
  
  first.half.corrected <- (n1 - n2) /(n1 * 1./target.summary["tract_length", col.names] - n2 * 1.0/first.half.summary["tract_length", col.names]) 
  second.half.corrected <-  (n1 - n3) / (n1 * 1./target.summary["tract_length", col.names] - n3 * 1./second.half.summary["tract_length", col.names])
  
  # first.half.corrected[first.half.corrected<target.summary["tract_length", col.names]] <- target.summary["tract_length",col.names[first.half.corrected<target.summary["tract_length", col.names]]]
  # second.half.corrected[second.half.corrected<target.summary["tract_length", col.names]] <- target.summary["tract_length",col.names[second.half.corrected<target.summary["tract_length", col.names]]]
  
  hist(target.summary["tract_length", col.names], breaks = 50,
       main = paste("PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
  hist(first.half.corrected, breaks = 50,
       main = paste("First half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
  hist(second.half.corrected, breaks = 50,
       main = paste("Second half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))   
  hist((first.half.corrected + second.half.corrected) / 2, breaks = 50,
       main = paste("Both half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
  
  plot(target.summary["tract_length", col.names], (first.half.corrected + second.half.corrected) / 2, 
       xlab = "PSJS estimated", ylab = "Both half corrected", 
       main = paste("Both half corrected vs PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
  abline(a = 0, b = 1)
  abline(h = tract)
  
  n.list <- c(rep(n1, length(col.names)), rep(n2, length(col.names)), rep(n3, length(col.names)))
  ni.theta <- c(n1*log(target.summary["tract_length", col.names]), n2*log(first.half.summary["tract_length", col.names]), n3*log(second.half.summary["tract_length", col.names]))
  theta.list <- c(log(target.summary["tract_length", col.names]), log(first.half.summary["tract_length", col.names]), log(second.half.summary["tract_length", col.names]))
  plot(n.list, ni.theta, 
       main = paste("-ni*log(p) vs ni, Tract = ", toString(tract), ".0 ", sep = ""))

  # OK, this violates the constant variance assumptiong of linear regression
  fit <- lm(ni.theta ~ n.list)
  y <- ni.theta / sqrt(n.list)
  x1 <- sqrt(n.list)
  x2 <- 1/sqrt(n.list)
  fit2 <- lm(y~0+x1+x2)
  abline(a = fit$coefficients[1], b = fit$coefficients[2], col = "red")
  
  plot(1/n.list, theta.list, 
       main = paste("-log(p) vs 1/ni, Tract = ", toString(tract), ".0 ", sep = ""))
  x3 <- 1/n.list
  fit3 <- lm(theta.list~x3)
  abline(a=fit3$coefficients[1], b=fit3$coefficients[2], col="red")
  
  print(fit$coefficients)
  print(c(exp(fit$coefficients[2]), exp(fit2$coefficients[1]), exp(fit3$coefficients[1])))
  print(c(mean(target.summary["tract_length", col.names])/exp(fit$coefficients[1]/n1),
          mean(target.summary["tract_length", col.names])/exp(fit2$coefficients[2]/n1),
          mean(target.summary["tract_length", col.names])/exp(fit3$coefficients[2]/n1)))
  
  
  print.matrix <- matrix(c(mean(target.summary["tract_length", col.names]), sd(target.summary["tract_length", col.names]), 
                           mean(first.half.corrected), sd(first.half.corrected), 
                           mean(second.half.corrected), sd(second.half.corrected),
                           mean((first.half.corrected + second.half.corrected) / 2), 
                           sd(first.half.corrected + second.half.corrected) / 2), 
                         4, 2, byrow = TRUE) 
  colnames(print.matrix) <- c("mean", "sd")
  rownames(print.matrix) <- c("PSJS estimated", "Corrected using first half", 
                              "Corrected using second half", "Corrected using both")
  print(print.matrix)
}

## (Intercept)      n.list 
## -111.603087    3.726936 
##      n.list          x1 (Intercept) 
##    41.55159    41.60182    41.68563 
## (Intercept)          x2          x3 
##    45.70088    45.73770    45.79297 
##                                  mean        sd
## PSJS estimated              36.375314  16.56685
## Corrected using first half   6.445536 190.26919
## Corrected using second half 25.431103 266.56870
## Corrected using both        15.938320 161.22476

## (Intercept)      n.list 
## -115.339877    4.273891 
##      n.list          x1 (Intercept) 
##    71.80047    71.75102    71.66871 
## (Intercept)          x2          x3 
##    80.38182    80.34491    80.28960 
##                                 mean        sd
## PSJS estimated              63.49233  31.42831
## Corrected using first half  62.39591 675.89787
## Corrected using second half 47.71725 190.24342
## Corrected using both        55.05658 349.87868

## (Intercept)      n.list 
##   33.209189    4.646589 
##      n.list          x1 (Intercept) 
##    104.2288    104.1801    104.0990 
## (Intercept)          x2          x3 
##    235.3034    235.2300    235.1201 
##                                    mean       sd
## PSJS estimated              251.8385122 746.9723
## Corrected using first half   -0.6239804 915.4536
## Corrected using second half  50.2867915 637.8758
## Corrected using both         24.8314055 572.5871

10162017 plot \({n_i}\hat \theta\) vs \(n_i\)